#### Synthea Health: Data Analysis and Insights Generation
!pip install -r requirements.txt
#import the required funcions and packages for EDA
import pandas as pd
from utils.utils import read_data_parse_dates,get_demographic_plot
import json
import plotly.express as px
Import the required required cleaned and transformed data and also required functions
config_file_path = './config/config.json'
# Load the JSON file
with open(config_file_path, 'r') as file:
config = json.load(file)
# Access the value of 'input_folder'
output_folder = config.get('cleaned_data_path')
print("output Folder:", output_folder)
output Folder: C://Users//veena.vemula//Documents//GitHub//SDE-Skills-Test//Output
df_phr = read_data_parse_dates(output_folder,"diagnosis_data",["BIRTHDATE"])
df_patients = read_data_parse_dates(output_folder,"patients_cleaned",["BIRTHDATE"])
df_encounters = read_data_parse_dates(output_folder,"encounters_cleaned",["START_ENC","STOP_ENC"])
df_conditions = read_data_parse_dates(output_folder,"conditions_cleaned",["START_DIAG","STOP_DIAG"])
df_medications = read_data_parse_dates(output_folder,"medications_cleaned",["START_MED","STOP_MED"])
df_procedures = read_data_parse_dates(output_folder,"procedures_cleaned",["START_PROC","STOP_PROC"])
Read all the input files from the output folder
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 6))
sns.histplot(df_phr['age_diag'], bins=10, kde=True, color='blue', alpha=0.7)
plt.title('Histogram of Patient Age Distribution', fontsize=16)
plt.xlabel('Age', fontsize=14)
plt.ylabel('Number of Patients', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
# Show the plot
plt.show()
Age distribution shows normal distribution with a most of the patients between age group 20 and 80. In the data there are patients above 100 age group - average life expectancy is 80 and there are more female patients compared to male in above 100 age group. Female are more prone to diseases than males but males life expectancy is shorter than females
patient_counts_by_diagnosis = df_phr.groupby('DESCRIPTION')['PATIENT_ID'].nunique()
patient_counts_sorted = patient_counts_by_diagnosis.sort_values(ascending=False)
print(patient_counts_sorted.head(30))
DESCRIPTION Stress (finding) 894 Viral sinusitis (disorder) 739 Limited social contact (finding) 661 Social isolation (finding) 653 Acute viral pharyngitis (disorder) 506 Not in labor force (finding) 501 Victim of intimate partner abuse (finding) 466 Body mass index 30+ - obesity (finding) 463 Acute bronchitis (disorder) 454 Prediabetes 341 Anemia (disorder) 324 Hypertension 292 Risk activity involvement (finding) 289 Normal pregnancy 217 Chronic sinusitis (disorder) 216 Has a criminal record (finding) 206 Miscarriage in first trimester 204 Chronic low back pain (finding) 195 Otitis media 160 Unhealthy alcohol drinking behavior (finding) 154 Housing unsatisfactory (finding) 154 Streptococcal sore throat (disorder) 146 Hyperlipidemia 138 Only received primary school education (finding) 123 Chronic neck pain (finding) 123 Sprain of ankle 110 Transport problems (finding) 107 Severe anxiety (panic) (finding 107 Lack of access to transportation (finding) 104 Suspected COVID-19 96 Name: PATIENT_ID, dtype: int64
top_15_diagnoses = df_phr.groupby('DESCRIPTION').size().sort_values(ascending = False).head(15)
# Replace diagnoses not in top 15 with "Others"
GroupedDiagnosis = df_phr['DESCRIPTION'].apply(
lambda x: x if x in top_15_diagnoses else 'Others'
)
GroupedDiagnosis.value_counts()
DESCRIPTION Others 5994 Stress (finding) 5137 Social isolation (finding) 1243 Viral sinusitis (disorder) 1233 Limited social contact (finding) 1200 Not in labor force (finding) 1077 Victim of intimate partner abuse (finding) 819 Acute viral pharyngitis (disorder) 678 Acute bronchitis (disorder) 571 Normal pregnancy 527 Body mass index 30+ - obesity (finding) 463 Risk activity involvement (finding) 392 Prediabetes 341 Anemia (disorder) 324 Hypertension 292 Chronic sinusitis (disorder) 219 Name: count, dtype: int64
top_15_diagnoses = GroupedDiagnosis.value_counts().head(30)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_15_diagnoses.values, y=top_15_diagnoses.index, palette='viridis')
plt.title('Top 15 Most Frequent Diagnoses', fontsize=16)
plt.xlabel('Number of Patients', fontsize=14)
plt.ylabel('Diagnosis Description', fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
C:\Users\veena.vemula\AppData\Local\Temp\ipykernel_9512\3395157393.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=top_15_diagnoses.values, y=top_15_diagnoses.index, palette='viridis')
Above graph shows Stress , Social isolatio, viral sinisitus and Social contact are the top diagnosis with higher number of patients
# Filter procedures for the last 10 years
current_year = df_procedures['Year'].max()
last_10_years = current_year - 9
df_procedures_last_10_years = df_procedures[df_procedures['Year'] >= last_10_years]
# Shorten procedure descriptions for clarity
df_procedures_last_10_years["DESCRIPTION_SHORT"] = df_procedures_last_10_years['DESCRIPTION'].str.slice(0, 35)
# Group by year and procedure description
procedure_trends_last_10_years = (
df_procedures_last_10_years.groupby(['Year', 'DESCRIPTION_SHORT'])
.size()
.reset_index(name='Count')
)
# Create the line chart
fig = px.line(
procedure_trends_last_10_years,
x='Year',
y='Count',
color='DESCRIPTION_SHORT',
line_group='DESCRIPTION_SHORT',
title='Trends in Medical Procedures Over the Last 10 Years',
labels={'Year': 'Year', 'DESCRIPTION_SHORT': 'Procedure', 'Count': 'Frequency'}
)
# Save and show the visualization
fig.write_html("./output/data_visualization/trend_procedure_last_10_years.html")
fig.show()
C:\Users\veena.vemula\AppData\Local\Temp\ipykernel_9512\467863593.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_procedures_last_10_years["DESCRIPTION_SHORT"] = df_procedures_last_10_years['DESCRIPTION'].str.slice(0, 35)
## scatter plot
df_proc_pat = pd.merge(df_procedures,df_patients,on ="PATIENT_ID",how="left")
df_proc_pat["Procedure_age"] = df_proc_pat["START_PROC"].dt.year-df_proc_pat["BIRTHDATE"].dt.year
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, float('inf')] # Add infinity for 90+
labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 2)] + [f"{bins[-2]}+"]
df_proc_pat['age_proc_group'] = pd.cut(df_proc_pat['Procedure_age'], bins=bins, labels=labels, right=False)
df_group_proc_age = df_proc_pat[["age_proc_group","DESCRIPTION"]].groupby(["age_proc_group","DESCRIPTION"]).size().reset_index().rename({0:"frequency"},axis=1)
df_group_proc_age[df_group_proc_age["frequency"]!=0]
fig = px.scatter(
df_group_proc_age,
x='age_proc_group', # X-axis: Age groups
y='frequency', # Y-axis: Frequency of procedures
color='DESCRIPTION', # Color by procedure type
size='frequency', # Bubble size (optional)
title='Frequency of Procedures by Age Group',
labels={'frequency': 'Frequency', 'age_group': 'Age Group'}
)
fig.write_html("./output/data_visualization/scatter_diag_freq.html")
fig.to_image(format = "png")
fig.show()
## social care needs were more in people over 60 age group
## immunotherapy was more in young people 10 to 20
## assessment of substance used procedure is seen across all group
C:\Users\veena.vemula\AppData\Local\Temp\ipykernel_9512\501261827.py:7: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
Above graph gives many insights like Assesment of healtha nd social care needs are higher in age groups 20 to 70. Depression screening is also higher fro age groups 20 to 80. Orthopaedics procedure is higher between agegroups 40 to 60
## number of visit by time
#from encounter data if i see multiple entries for patientid 1 shall take all or count for this patient is 1
# Combine year and month into a datetime column
df_encounters['date'] = pd.to_datetime(df_encounters[['Year', 'Month']].assign(day=1)) # Assume day=1
# Count unique patient IDs over time
patients_over_time = df_encounters.groupby(['Year', 'Month']).agg(num_patients=('PATIENT_ID', 'nunique')).reset_index()
# Create a slider
fig = px.bar(
patients_over_time,
x='Month',
y='num_patients',
animation_frame='Year', # Enables the slider for years
title='Number of Patients Over Time by Year',
labels={'month': 'Month', 'num_patients': 'Number of Patients'},
range_y=[0, patients_over_time['num_patients'].max() + 5] # Adjust Y-axis range for clarity
)
# Customize layout
fig.update_layout(
xaxis=dict(tickmode='array', tickvals=list(range(1, 13)), ticktext=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']),
transition={'duration': 500} # Smooth transition between frames
)
fig.write_html("./output/data_visualization/slider_patients_visit_overtime.html")
fig.to_image(format = "png")
fig.show()
## spike of patient visist seen in 2021 covid years
Number of patients started to rise from 1970 - this may be becuase of lack of data. There is an hike in number of patients in 2020 may be because of Covid.There is an sudden increase in number of patients in 2014.
def get_demographic_top20_plot(df,x,y,title):
temp_data = df.groupby([x,y]).size().reset_index(name='Count')
fig = px.bar(
temp_data,
x='DESCRIPTION',
y='Count',
color=x,
barmode='group',
title=title,
labels={'diagnosis_id': 'Diagnosis ID', 'Count': 'Frequency'}
)
return fig
df_pat_cond["age_diag"] = df_pat_cond["START_DIAG"].dt.year-df_pat_cond["BIRTHDATE"].dt.year
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, float('inf')] # Add infinity for 90+
labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 2)] + [f"{bins[-2]}+"]
df_pat_cond['age_diag_group'] = pd.cut(df_pat_cond['age_diag'], bins=bins, labels=labels, right=False)
demo_diag_data = df_pat_cond[["PATIENT_ID","age_diag_group","STATE","MARITAL","RACE","ETHNICITY","GENDER","DESCRIPTION"]].drop_duplicates()
## dist of diag by age on top 50 diag
top20_diag = list(demo_diag_data["DESCRIPTION"].value_counts().reset_index().sort_values(by="DESCRIPTION",ascending=False).head(50)["DESCRIPTION"].values)
demo_diag_data_top20 = demo_diag_data[demo_diag_data["DESCRIPTION"].isin(top20_diag)]
fig = get_demographic_top20_plot(demo_diag_data_top20,"age_diag_group","DESCRIPTION","Distribution of top 20 diag by age_group")
fig.to_image(format = "png")
fig.show()
C:\Users\veena.vemula\AppData\Local\Temp\ipykernel_9512\2424614841.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
## dist of diag by race on top diag
fig = get_demographic_top20_plot(demo_diag_data_top20,"RACE","DESCRIPTION","Distribution of top 20 diag by RACE")
fig.to_image(format = "png")
fig.show()
df_pat_med = pd.merge(df_medications, df_patients, on='PATIENT_ID', how='inner')
df_pat_med["age_med"] = df_pat_med["START_MED"].dt.year-df_pat_med["BIRTHDATE"].dt.year ## calcualte age at the time of medicine given
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, float('inf')] # Add infinity for 90+
labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 2)] + [f"{bins[-2]}+"]
df_pat_med['age_med_group'] = pd.cut(df_pat_med['age_med'], bins=bins, labels=labels, right=False)
demo_med_data = df_pat_med[["PATIENT_ID","age_group","STATE","MARITAL","RACE","ETHNICITY","GENDER","DESCRIPTION"]].drop_duplicates()
demo_med_data## removing duplicates on above combination
## dist of medication by age group on top 50 =medication
top20_diag = list(demo_med_data["DESCRIPTION"].value_counts().reset_index().sort_values(by="DESCRIPTION",ascending=False).head(50)["DESCRIPTION"].values)
demo_med_data_top20 = demo_med_data[demo_med_data["DESCRIPTION"].isin(top20_diag)]
fig = get_demographic_top20_plot(demo_med_data_top20,"age_group","DESCRIPTION","Distribution of top 20 medication by Medication")
fig.to_image(format = "png")
fig.show()
## by RACE
fig = get_demographic_top20_plot(demo_med_data_top20,"RACE","DESCRIPTION","Distribution of top 20 medication by RACE")
fig.to_image(format = "png")
fig.show()
# • Calculate mean, median, and standard deviation for numerical columns (e.g., age, frequency of visits).
aga_diag_mean = df_pat_cond["age_diag"].mean()
aga_diag_median = df_pat_cond["age_diag"].median()
aga_diag_std = df_pat_cond["age_diag"].std()
fig = px.bar( x=["Age_mean","Age_median","Age_std"],y=[aga_diag_mean,aga_diag_median,aga_diag_std], title="Mean Median Std of Age",
color_continuous_scale='Viridis')
#fig.write_html("./output/data_visualization/top_patient_by_visit.html")
fig.to_image(format = "png")
fig.show()
#for frequency of visit
freq_visit_mean = df_encounters["PATIENT_ID"].value_counts().reset_index()["count"].mean()
freq_visit_median = df_encounters["PATIENT_ID"].value_counts().reset_index()["count"].median()
freq_visit_std = df_encounters["PATIENT_ID"].value_counts().reset_index()["count"].std()
fig = px.bar( x=["freq_visit_mean","freq_visit__median","freq_visit_std"],y=[freq_visit_mean,freq_visit_median,freq_visit_std], title="Mean Median Std of frequency of visit",
color_continuous_scale='Viridis')
#fig.write_html("./output/data_visualization/top_patient_by_visit.html")
fig.to_image(format = "png")
fig.show()
# • Calculate mean, median, and standard deviation for numerical columns frequency of visits).
aga_diag_mean = df_pat_cond["age_diag"].mean()
aga_diag_median = df_pat_cond["age_diag"].median()
aga_diag_std = df_pat_cond["age_diag"].std()
fig = px.bar( x=["Age_mean","Age_median","Age_std"],y=[aga_diag_mean,aga_diag_median,aga_diag_std], title="Mean Median Std of Age",
color_continuous_scale='Viridis')
#fig.write_html("./output/data_visualization/top_patient_by_visit.html")
fig.to_image(format = "png")
fig.show()
#
Mean and median are almost same with small std deviation implies the data spread for Age and frequency of visit follows normal distribution